TRAIN AND TEST DATA
# Load train data
train <- pseq(subset = "train")
print(train)
phyloseq-class experiment-level object
otu_table() OTU Table: [ 5748 taxa and 3615 samples ]
sample_data() Sample Data: [ 3615 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 5748 taxa by 7 taxonomic ranks ]
test <- pseq(subset = "test")
print(test)
phyloseq-class experiment-level object
otu_table() OTU Table: [ 5748 taxa and 1809 samples ]
sample_data() Sample Data: [ 1809 samples by 12 sample variables ]
tax_table() Taxonomy Table: [ 5748 taxa by 7 taxonomic ranks ]
EXPLORING SYNTHETIC CLINICAL DATA
# Check clinical variables
pheno <- data.frame(sample_data(train))
pheno <- pheno[complete.cases(pheno), ] # remove NA's
pheno[, c(3:8, 12)] <- apply(pheno[, c(3:8, 12)], 2, function(x) ifelse(x == 1, "yes", "no"))
tab <- createTable(compareGroups(Event ~ ., data = pheno))
export2md(tab)
Summary descriptives table by groups of `Event’
|
|
no
|
yes
|
p.overall
|
|
|
N=3242
|
N=298
|
|
|
Age
|
49.0 (14.8)
|
55.5 (13.8)
|
<0.001
|
|
BodyMassIndex
|
27.0 (4.68)
|
28.0 (5.11)
|
0.002
|
|
Smoking:
|
|
|
0.039
|
|
no
|
2466 (76.1%)
|
243 (81.5%)
|
|
|
yes
|
776 (23.9%)
|
55 (18.5%)
|
|
|
BPTreatment:
|
|
|
0.307
|
|
no
|
2744 (84.6%)
|
245 (82.2%)
|
|
|
yes
|
498 (15.4%)
|
53 (17.8%)
|
|
|
PrevalentDiabetes:
|
|
|
0.806
|
|
no
|
3042 (93.8%)
|
278 (93.3%)
|
|
|
yes
|
200 (6.17%)
|
20 (6.71%)
|
|
|
PrevalentCHD:
|
|
|
0.024
|
|
no
|
3142 (96.9%)
|
281 (94.3%)
|
|
|
yes
|
100 (3.08%)
|
17 (5.70%)
|
|
|
PrevalentHFAIL:
|
|
|
0.926
|
|
no
|
3157 (97.4%)
|
291 (97.7%)
|
|
|
yes
|
85 (2.62%)
|
7 (2.35%)
|
|
|
Event_time
|
13.9 (5.62)
|
12.5 (6.13)
|
<0.001
|
|
SystolicBP
|
136 (21.8)
|
141 (24.7)
|
0.001
|
|
NonHDLcholesterol
|
4.07 (1.08)
|
4.19 (1.23)
|
0.122
|
|
Sex:
|
|
|
0.604
|
|
no
|
1808 (55.8%)
|
161 (54.0%)
|
|
|
yes
|
1434 (44.2%)
|
137 (46.0%)
|
|
# View table (without NA's)
DT::datatable(pheno)
# Remove artifacts
pheno2 <- pheno[-which(pheno$PrevalentHFAIL == "yes" &
pheno$Event == "yes"), ]
# Plot Event time distribution
hist(pheno$Event_time[which(pheno$Event == "yes")], 10)

hist(pheno$Event_time[which(pheno$Event == "no")], 10)

# Plot Age
hist(pheno$Age[which(pheno$Event == "yes")], 10)

hist(pheno$Age[which(pheno$Event == "no")], 10)

# Plot BMI
hist(pheno$BodyMassIndex[which(pheno$Event == "yes")], 10)

hist(pheno$BodyMassIndex[which(pheno$Event == "no")], 10)

MICROBIOME PREPROCESSING PIPELINE
# Load data
train <- pseq(subset = "train")
test <- pseq(subset = "test")
# Remove samples
train <- remove_samples(train,
remove_nas = TRUE,
remove_neg = FALSE)
[1] "Samples with NA's in time or event have been removed!"
# Agglomerate by Species
train <- tax_glom(train, taxrank = "Species")
train <- subset_taxa(train, Species != "s__")
test <- tax_glom(test, taxrank = "Species")
test <- subset_taxa(test, Species != "s__")
# Calculate richness
richness_train <- estimate_richness(
train,
split = TRUE,
measures = c("Shannon", "Observed",
"Chao1", "Simpson",
"InvSimpson", "Fisher"))
richness_test <- estimate_richness(
test,
split = TRUE,
measures = c("Shannon", "Observed",
"Chao1", "Simpson",
"InvSimpson", "Fisher"))
# Filter taxa by counts
train <- filter_taxa(train,
function(x) sum(x > 2) > (0.8 * length(x)), TRUE)
# Calculate co-abundances
coab_taxa <- co_abundances(train)
print(table(coab_taxa$clusters))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
17 2 14 18 3 20 13 21 2 7 8 13 2 5 21 6 12 2 19 6 19 10 2 7
DT::datatable(coab_taxa)
# Caculate GSVA
scores <- get_scores(coab_taxa, train, test, method = "gsva")
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels
|
| | 0%
|
|=== | 4%
|
|======= | 8%
|
|========== | 12%
|
|============= | 17%
|
|================= | 21%
|
|==================== | 25%
|
|======================= | 29%
|
|=========================== | 33%
|
|============================== | 38%
|
|================================= | 42%
|
|===================================== | 46%
|
|======================================== | 50%
|
|=========================================== | 54%
|
|=============================================== | 58%
|
|================================================== | 62%
|
|===================================================== | 67%
|
|========================================================= | 71%
|
|============================================================ | 75%
|
|=============================================================== | 79%
|
|=================================================================== | 83%
|
|====================================================================== | 88%
|
|========================================================================= | 92%
|
|============================================================================= | 96%
|
|================================================================================| 100%
Estimating GSVA scores for 24 gene sets.
Estimating ECDFs with Gaussian kernels
|
| | 0%
|
|=== | 4%
|
|======= | 8%
|
|========== | 12%
|
|============= | 17%
|
|================= | 21%
|
|==================== | 25%
|
|======================= | 29%
|
|=========================== | 33%
|
|============================== | 38%
|
|================================= | 42%
|
|===================================== | 46%
|
|======================================== | 50%
|
|=========================================== | 54%
|
|=============================================== | 58%
|
|================================================== | 62%
|
|===================================================== | 67%
|
|========================================================= | 71%
|
|============================================================ | 75%
|
|=============================================================== | 79%
|
|=================================================================== | 83%
|
|====================================================================== | 88%
|
|========================================================================= | 92%
|
|============================================================================= | 96%
|
|================================================================================| 100%
DT::datatable(scores$train)
DT::datatable(scores$test)
# Create train and test data
pheno_train <- sample_data(train)
x_train <- data.frame(pheno_train[, -c(8, 9)],
richness_train,
scores$train)
y_train <- pheno_train[, c("Event_time", "Event")]
pheno_test <- sample_data(test)
x_test <- data.frame(pheno_test[, -c(8, 9)],
richness_test,
scores$test)
y_test <- pheno_test[, c("Event_time", "Event")]
PREPARE DATA TO TRAIN
# Creating train and test data
train <- cbind.data.frame(x_train, y_train)
test <- cbind.data.frame(x_test, y_test)
# What do we do with ...
# ======
# Negatives survival values in train and test?
train <- train[-which(train$Event_time < 0), ]
test$Event_time[which(test$Event_time < 0)] <- 15
# NA's values in train and test?
train <- train[complete.cases(train), ]
test <- missRanger(test, pmm.k = 10, seed = 153)
Missing value imputation by random forests
Variables to impute: Smoking, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Event_time, Event
Variables used to impute: Age, BodyMassIndex, Smoking, BPTreatment, PrevalentDiabetes, PrevalentCHD, PrevalentHFAIL, SystolicBP, NonHDLcholesterol, Sex, Observed, Chao1, se.chao1, Shannon, Simpson, InvSimpson, Fisher, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, cluster_20, cluster_21, cluster_22, cluster_23, cluster_24, cluster_25, Event_time, Event
iter 1: ........
iter 2: ........
iter 3: ........
iter 4: ........
iter 5: ........
# Removing PrevalentHFAIL (only 0)
train <- train[, -grep("PrevalentHFAIL", colnames(train))]
test <- test[, -grep("PrevalentHFAIL", colnames(test))]
DT::datatable(train)
Warning in instance$preRenderHook(instance): It seems your data is too big for client-
side DataTables. You may consider server-side processing: https://rstudio.github.io/DT/
server.html
FIT THE MODEL
# Fit model
# ========
cvrts <- c("Age", "BodyMassIndex",
"SystolicBP", "NonHDLcholesterol",
"Sex")
(biomarkers <- setdiff(colnames(train), c(cvrts, colnames(y_train))))
[1] "Smoking" "BPTreatment" "PrevalentDiabetes" "PrevalentCHD"
[5] "Observed" "Chao1" "se.chao1" "Shannon"
[9] "Simpson" "InvSimpson" "Fisher" "cluster_1"
[13] "cluster_2" "cluster_3" "cluster_4" "cluster_5"
[17] "cluster_6" "cluster_7" "cluster_8" "cluster_9"
[21] "cluster_10" "cluster_11" "cluster_12" "cluster_13"
[25] "cluster_14" "cluster_15" "cluster_16" "cluster_17"
[29] "cluster_18" "cluster_19" "cluster_20" "cluster_21"
[33] "cluster_22" "cluster_23" "cluster_24" "cluster_25"
models <- fit_biospear(data = train,
biomarkers = biomarkers,
surv = c("Event_time", "Event"),
cvrts = cvrts,
inter = FALSE,
methods = "lasso")
Computing selection with method: lasso
print(models)
$lasso
Age BodyMassIndex SystolicBP NonHDLcholesterol Sex
0.044578010 0.012260426 0.002131799 -0.095917491 0.033412726
PrevalentCHD InvSimpson cluster_5
0.055240331 -0.017648201 0.106224418
PREDICT IN TEST
environment(predRes2) <- asNamespace("biospear")
assignInNamespace("predRes", predRes2, ns = "biospear")
prediction <- predRes2(res = models,
method = "lasso",
traindata = train,
newdata = test,
int.cv = FALSE,
int.cv.nfold = 5,
time = seq(1, 16, 1),
trace = TRUE,
ncores = 20)
Computing prediction criteria for: training set
Computing prediction criteria for: validation set
t_train <- train$Event_time
e_train <- train$Event
t_test <- test$Event_time
e_test <- test$Event
m <- names(models)
lapply(m, function(i) {
s_train <- prediction$scores_train[, i]
s_test <- prediction$scores_extval[, i]
# Concordance
c_train <- rcorr.cens(exp(-s_train), Surv(t_train, e_train), outx = FALSE)
c_test <- rcorr.cens(exp(-s_test), Surv(t_test, e_test), outx = FALSE)
# Print C-Index
print(paste0("C-Index in train set ", i, " is: ", c_train[1]))
print(paste0("C-Index in test set ", i, " is: ", c_test[1]))
})
[1] "C-Index in train set lasso is: 0.719786125733815"
[1] "C-Index in test set lasso is: 0.71177703945034"
[[1]]
[1] "C-Index in test set lasso is: 0.71177703945034"
SAVE OUTPUT
# Save scores
scores <- data.frame(
SampleID = rownames(test),
Score = prediction$scores_extval[, 1],
time = t_test,
event = e_test,
age = test$Age
)
print(hist(scores$Score))

$breaks
[1] 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2
$counts
[1] 33 90 142 140 151 153 140 167 155 178 164 136 96 55 8 1
$density
[1] 0.091210614 0.248756219 0.392482034 0.386954118 0.417357656 0.422885572 0.386954118
[8] 0.461580984 0.428413488 0.491984522 0.453289110 0.375898286 0.265339967 0.152017689
[15] 0.022111664 0.002763958
$mids
[1] 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5 2.7 2.9 3.1 3.3 3.5 3.7 3.9 4.1
$xname
[1] "scores$Score"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
DT::datatable(scores)
print(plot(scores$Score, scores$age))

NULL